###############################################
#model used for Benford Law testing a sample
###############################################
#clear all variables in workspace
rm(list=ls()) 


options(scipen=999, digits=2)   #   disable scientific notation 

#LOAD LIBRARIES
library(tidyverse)
library(benford.analysis)


#---- Set Working Directory  ------------------------
setwd("/Users/fred/Desktop/RPNP/")    # change to the directory containing your input csv file
outFile<-"BenfordFirstTwo_MG.csv"

#read DB in a dataset named 'bd'

#csv from Macbook
#d2017<- read.csv2(header = TRUE, "/Users/fred/Downloads/2017.csv")
d2018<- read.csv2(header = TRUE, "/Users/fred/Downloads/2018.csv")
d2019<- read.csv2(header = TRUE, "/Users/fred/Downloads/2019.csv")
d2020<- read.csv2(header = TRUE, "/Users/fred/Downloads/2020.csv")
d2021<- read.csv2(header = TRUE, "/Users/fred/Downloads/2021.csv")
d2022<- read.csv2(header = TRUE, "/Users/fred/Downloads/2022.csv")
#csv from Windows
#df<- read.csv2("C:/Users/Frederico/Desktop/RPNP/2017.csv", sep = ",")
#d2017<- read.csv2(header = TRUE, "C:/Users/Frederico/Desktop/RPNP/2017.csv")
#d2018<- read.csv2(header = TRUE, "C:/Users/Frederico/Desktop/RPNP/2018.csv")
#d2019<- read.csv2(header = TRUE, "C:/Users/Frederico/Desktop/RPNP/2019.csv")
#d2020<- read.csv2(header = TRUE, "C:/Users/Frederico/Desktop/RPNP/2020.csv")


############
#---- removing rows with 0 (zero) as Valor.Nao.Processado ) 
#this will exclude Restos a Pagar Processados -----------------------
#d2017<-d2017[!grepl("***",fixed = TRUE, d2017$CNPJ.CPF),]
d2018<-d2018[!grepl("0",fixed = TRUE, d2018$Valor.Inscrito.Nao.Processado),]
d2019<-d2019[!grepl("0",fixed = TRUE, d2019$Valor.Inscrito.Nao.Processado),]
d2020<-d2020[!grepl("0",fixed = TRUE, d2020$Valor.Inscrito.Nao.Processado),]
d2021<-d2021[!grepl("0",fixed = TRUE, d2021$Valor.Inscrito.Nao.Processado),]
d2022<-d2022[!grepl("0",fixed = TRUE, d2022$Valor.Inscrito.Nao.Processado),]


#---- insert column with YEAR as identifier -----------------------
#d2017$Ano = "2017"
d2018$Ano = "2018"
d2019$Ano = "2019"
d2020$Ano = "2020"
d2021$Ano = "2021"
d2022$Ano = "2022"

#---- adapt text to number -----------------------
d2018$Valor.Inscrito.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2018$Valor.Inscrito.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2018$Valor.Inscrito.Nao.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "",d2018$Valor.Inscrito.Nao.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2018$Valor.Pago.no.Ano<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2018$Valor.Pago.no.Ano, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2018$Outros.Valores.Retidos<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2018$Outros.Valores.Retidos, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2018$Valor.a.Pagar<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2018$Valor.a.Pagar, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)

d2019$Valor.Inscrito.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2019$Valor.Inscrito.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2019$Valor.Inscrito.Nao.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "",d2019$Valor.Inscrito.Nao.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2019$Valor.Pago.no.Ano<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2019$Valor.Pago.no.Ano, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2019$Outros.Valores.Retidos<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2019$Outros.Valores.Retidos, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2019$Valor.a.Pagar<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2019$Valor.a.Pagar, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)

d2020$Valor.Inscrito.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2020$Valor.Inscrito.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2020$Valor.Inscrito.Nao.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "",d2020$Valor.Inscrito.Nao.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2020$Valor.Pago.no.Ano<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2020$Valor.Pago.no.Ano, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2020$Outros.Valores.Retidos<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2020$Outros.Valores.Retidos, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2020$Valor.a.Pagar<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2020$Valor.a.Pagar, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)

d2021$Valor.Inscrito.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2021$Valor.Inscrito.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2021$Valor.Inscrito.Nao.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "",d2021$Valor.Inscrito.Nao.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2021$Valor.Pago.no.Ano<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2021$Valor.Pago.no.Ano, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2021$Outros.Valores.Retidos<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2021$Outros.Valores.Retidos, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2021$Valor.a.Pagar<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2021$Valor.a.Pagar, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)

d2022$Valor.Inscrito.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2022$Valor.Inscrito.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2022$Valor.Inscrito.Nao.Processado<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "",d2022$Valor.Inscrito.Nao.Processado, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2022$Valor.Pago.no.Ano<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2022$Valor.Pago.no.Ano, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2022$Outros.Valores.Retidos<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2022$Outros.Valores.Retidos, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)
d2022$Valor.a.Pagar<-as.numeric(sub(",", ".", sub(".", "", sub(".", "", sub(".", "", d2022$Valor.a.Pagar, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE), digits = 2)


#################################
#---- Create Descriptive Analisys  -----------------------
DescriptiveRPNP<-as.data.frame(0)

#---- Count rows in each year  -----------------------

#DescriptiveRPNP[1,1]<-as.numeric(nrow(d2017))
DescriptiveRPNP[1,1]<-as.numeric(nrow(d2018))
DescriptiveRPNP[2,1]<-as.numeric(nrow(d2019))
DescriptiveRPNP[3,1]<-as.numeric(nrow(d2020))
DescriptiveRPNP[4,1]<-as.numeric(nrow(d2021))
DescriptiveRPNP[5,1]<-as.numeric(nrow(d2022))

DescriptiveRPNP[6,1]<-sum(DescriptiveRPNP[1:5,1])

#---- Sum values in   -----------------------
#DescriptiveRPNP[1,2]<-sum(d2017$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[1,2]<-sum(d2018$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[2,2]<-sum(d2019$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[3,2]<-sum(d2020$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[4,2]<-sum(d2021$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[5,2]<-sum(d2022$Valor.Inscrito.Nao.Processado)

DescriptiveRPNP[6,2]<-sum(DescriptiveRPNP[1:5,2])

#DescriptiveRPNP[1,3]<-min(d2017$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[1,3]<-min(d2018$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[2,3]<-min(d2019$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[3,3]<-min(d2020$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[4,3]<-min(d2021$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[5,3]<-min(d2022$Valor.Inscrito.Nao.Processado)

DescriptiveRPNP[6,3]<-min(DescriptiveRPNP[1:5,3])

#DescriptiveRPNP[1,4]<-max(d2017$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[1,4]<-max(d2018$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[2,4]<-max(d2019$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[3,4]<-max(d2020$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[4,4]<-max(d2021$Valor.Inscrito.Nao.Processado)
DescriptiveRPNP[5,4]<-max(d2022$Valor.Inscrito.Nao.Processado)

DescriptiveRPNP[6,4]<-max(DescriptiveRPNP[1:5,4])

rownames(DescriptiveRPNP)=c(2018,2019,2020,2021,2022,"Total")
colnames(DescriptiveRPNP)=c("Observations","Summation","Min","Max")


ObsRPNP<-format(DescriptiveRPNP[5,1], big.mark=".",decimal.mark =",")
TotRPNP<-format(DescriptiveRPNP[5,2], nsmall=2, big.mark=".", decimal.mark =",")
MinRPNP <- format(DescriptiveRPNP[5,3], big.mark=".")
MaxRPNP <- format(DescriptiveRPNP[5,4],nsmall=2,scientific=FALSE, big.mark=".", decimal.mark =",")

ObsRPNP
TotRPNP
MinRPNP
MaxRPNP


#---- removing rows with persons CPF (that contains "***") 
#this will retain only CNPJ for companies -----------------------
#d2017<-d2017[!grepl("***",fixed = TRUE, d2017$CNPJ.CPF),]
d2018<-d2018[!grepl("***",fixed = TRUE, d2018$CNPJ.CPF),]
d2019<-d2019[!grepl("***",fixed = TRUE, d2019$CNPJ.CPF),]
d2020<-d2020[!grepl("***",fixed = TRUE, d2020$CNPJ.CPF),]
d2021<-d2021[!grepl("***",fixed = TRUE, d2021$CNPJ.CPF),]
d2022<-d2022[!grepl("***",fixed = TRUE, d2022$CNPJ.CPF),]

sum(df$RPNP)*100000

#---- Count total CNPJ and insert into the Descriptive Analisys  -
DescriptiveCNPJ<-as.data.frame(0)
DescriptiveCNPJ[1,1]<-nrow(as.data.frame(table(d2018$Orgao)))
DescriptiveCNPJ[2,1]<-nrow(as.data.frame(table(d2019$Orgao)))
DescriptiveCNPJ[3,1]<-nrow(as.data.frame(table(d2020$Orgao)))
DescriptiveCNPJ[4,1]<-nrow(as.data.frame(table(d2021$Orgao)))
DescriptiveCNPJ[5,1]<-nrow(as.data.frame(table(d2022$Orgao)))

DescriptiveCNPJ[6,1]<-sum(DescriptiveCNPJ[1:5,1])

#---- Percentage of CASES by Company per YEAR  -

DescriptiveCNPJ[1,2]<-(DescriptiveCNPJ[1,1]/DescriptiveCNPJ[6,1])*1000000
DescriptiveCNPJ[2,2]<-(DescriptiveCNPJ[2,1]/DescriptiveCNPJ[6,1])*1000000
DescriptiveCNPJ[3,2]<-(DescriptiveCNPJ[3,1]/DescriptiveCNPJ[6,1])*1000000
DescriptiveCNPJ[4,2]<-(DescriptiveCNPJ[4,1]/DescriptiveCNPJ[6,1])*1000000
DescriptiveCNPJ[5,2]<-(DescriptiveCNPJ[5,1]/DescriptiveCNPJ[6,1])*1000000
#---- Percentage variation CASES between LAST year and FIRST year
DescriptiveCNPJ[6,2]<-(DescriptiveCNPJ[5,1]/DescriptiveCNPJ[1,1]-1)*1000000


#---- SUM of values, after filtering only CNPJ (Companies) 
DescriptiveCNPJ[1,3]<-sum(d2018$Valor.Inscrito.Nao.Processado)
DescriptiveCNPJ[2,3]<-sum(d2019$Valor.Inscrito.Nao.Processado)
DescriptiveCNPJ[3,3]<-sum(d2020$Valor.Inscrito.Nao.Processado)
DescriptiveCNPJ[4,3]<-sum(d2021$Valor.Inscrito.Nao.Processado)
DescriptiveCNPJ[5,3]<-sum(d2022$Valor.Inscrito.Nao.Processado)

DescriptiveCNPJ[6,3]<-sum(DescriptiveCNPJ[1:5,3])


#---- Percentage of VALUES by Company per YEAR  -
DescriptiveCNPJ[1,4]<-(DescriptiveCNPJ[1,3]/DescriptiveCNPJ[5,3])*1000000
DescriptiveCNPJ[2,4]<-(DescriptiveCNPJ[2,3]/DescriptiveCNPJ[5,3])*1000000
DescriptiveCNPJ[3,4]<-(DescriptiveCNPJ[3,3]/DescriptiveCNPJ[5,3])*1000000
DescriptiveCNPJ[4,4]<-(DescriptiveCNPJ[4,3]/DescriptiveCNPJ[5,3])*1000000
DescriptiveCNPJ[5,4]<-(DescriptiveCNPJ[5,3]/DescriptiveCNPJ[5,3])*1000000
#---- Percentage variation VALUE between LAST year and FIRST year
DescriptiveCNPJ[6,4]<-(DescriptiveCNPJ[5,3]/DescriptiveCNPJ[1,3]-1)*1000000

rownames(DescriptiveCNPJ)=c(2018,2019,2020,2021,2022,"Total")
colnames(DescriptiveCNPJ)<-c("nCNPJ","%N","vCNPJ","%V")
                             
#---- Combine all data in one single Data Base  -----------------------
library(tidyverse)
df<-dplyr::bind_rows(d2018,d2019,d2020,d2021,d2022)



#---- Change Columns names  -----------------------
colnames(df)<-c("CO_Orgao","Orgao","CO_Acao","Acao","CO_Elemento","Elemento","CO_Item","Item","Fornecedor","CNPJ","RPP","RPNP","Pago","Retido","Saldo","Ano")


#---- visual inspection -----------------------
class(df$RPP)
class(df$RPNP)
class(df$Pago)
class(df$Saldo)
View(df)
str(df)

summary(df$RPNP)

#treating variable RPNP 
###############################################
# Multiply records 0.01<=x<10 by 1000 to move decimal point to the right
df$RPNP <- ifelse(df$RPNP <10 & df$RPNP >=0.01,
                  df$RPNP*1000,df$RPNP)

df<-df[!(df$RPNP<0.10),] # Delete records < 0.01
#can be any value, compatible with a log10 



#---- Insert filtered data into the Descriptive Analisys  -----------------------
DescriptiveRPNP[7,1]<-as.numeric(nrow(df))
DescriptiveRPNP[7,2]<-sum(df$RPNP)
DescriptiveRPNP[7,3]<-min(df$RPNP)
DescriptiveRPNP[7,4]<-max(df$RPNP)

rownames(DescriptiveRPNP)=c(2018,2019,2020,2021,2022,"Total","CNPJ")


#---- Create Yaer by Year analisys of dataset
d1<-as.data.frame(0)
d1[1,1]<-as.data.frame(table(df$Ano))[1,2]
d1[2,1]<-as.data.frame(table(df$Ano))[2,2]
d1[3,1]<-as.data.frame(table(df$Ano))[3,2]
d1[4,1]<-as.data.frame(table(df$Ano))[4,2]
d1[5,1]<-as.data.frame(table(df$Ano))[5,2]
d1[6,1]<-sum(d1[1:5,1])

d1[1,2]<-(d1[1,1]/d1[6,1])*1000000
d1[2,2]<-(d1[2,1]/d1[6,1])*1000000
d1[3,2]<-(d1[3,1]/d1[6,1])*1000000
d1[4,2]<-(d1[4,1]/d1[6,1])*1000000
d1[5,2]<-(d1[5,1]/d1[6,1])*1000000
d1[6,2]<-sum(d1[1:5,2])/10000

library(dplyr)
d2<-df[,11:16]
d3<-d2 %>% filter(Ano == 2018) 
d1[1,3]<-sum(d3[,2])
d1[1,4]<-min(d3[,2])
d1[1,5]<-max(d3[,2])
d1[1,6]<-mean(d3[,2])
d1[1,7]<-median(d3[,2])

d3<-d2 %>% filter(Ano == 2019) 
d1[2,3]<-sum(d3[,2])
d1[2,4]<-min(d3[,2])
d1[2,5]<-max(d3[,2])
d1[2,6]<-mean(d3[,2])
d1[2,7]<-median(d3[,2])

d3<-d2 %>% filter(Ano == 2020)
d1[3,3]<-sum(d3[,2])
d1[3,4]<-min(d3[,2])
d1[3,5]<-max(d3[,2])
d1[3,6]<-mean(d3[,2])
d1[3,7]<-median(d3[,2])

d3<-d2 %>% filter(Ano == 2021)
d1[4,3]<-sum(d3[,2])
d1[4,4]<-min(d3[,2])
d1[4,5]<-max(d3[,2])
d1[4,6]<-mean(d3[,2])
d1[4,7]<-median(d3[,2])

d3<-d2 %>% filter(Ano == 2022)
d1[5,3]<-sum(d3[,2])
d1[5,4]<-min(d3[,2])
d1[5,5]<-max(d3[,2])
d1[5,6]<-mean(d3[,2])
d1[5,7]<-median(d3[,2])

d1[6,3]<-sum(df$RPNP )
d1[6,4]<-min(df$RPNP)
d1[6,5]<-max(df$RPNP)
d1[6,6]<-mean(df$RPNP)
d1[6,7]<-median(df$RPNP)
rownames(d1)<-rownames(DescriptiveCNPJ)
colnames(d1)<-c("nAno","%Ano","SumAno", "MinAno", "MaxAno","MeanAno", "MedianAno")
#---- Combine descriptive
DescriptiveCNPJ<-cbind(DescriptiveCNPJ, d1)
rm(d1,d2,d3)


ObsCNPJ<-format(DescriptiveCNPJ[6,5], big.mark=".",decimal.mark =",")
TotCNPJ<-format(DescriptiveCNPJ[6,7], nsmall=2, big.mark=".", decimal.mark =",")
MinCNPJ <- format(DescriptiveCNPJ[6,8], big.mark=".")
MaxCNPJ <- format(DescriptiveCNPJ[6,9],nsmall=2,scientific=FALSE, big.mark=".", decimal.mark =",")
MeanCNPJ <- format(DescriptiveCNPJ[6,10], big.mark=".")
MedianCNPJ <- format(DescriptiveCNPJ[6,11], big.mark=".")

ObsCNPJ
TotCNPJ
MinCNPJ
MaxCNPJ
MeanCNPJ
MedianCNPJ


###############################################
#Benford test RPNP for the First Two Digits
###############################################
library(benford.analysis)
bnf1<-benford.analysis::benford(df$RPNP, number.of.digits = 1, sign = "positive", discrete=TRUE, round=6)
bnf12<-benford.analysis::benford(df$RPNP, number.of.digits = 2, sign = "positive", discrete=TRUE, round=6)

###--- Show Benford First Two Digits Summary
bnf1
cat('Our conclusion FOR FIRST DIGIT is:',bnf1$MAD.conformity,"\n")

bnf12


###--- Create Benfod vs Data table FOR THE FIRST DIGIT
BenTest1<-as.data.frame(bnf1$bfd$digits)
BenTest1[,2]<-as.data.frame(bnf1$bfd$data.dist.freq)
BenTest1[,3]<-as.data.frame(bnf1$bfd$data.dist)
BenTest1[,4]<-as.data.frame(bnf1$bfd$benford.dist)
BenTest1[,5]<-BenTest1[,3]-BenTest1[,4]
BenTest1[,6]<-abs(BenTest1[,5])

###--- Two Sample Z-Test
#To test the hypothesis that the mean of normal sample (status=0) 
#and the actual sample (status=1) are equal (d0=0 ) 
BenTest1[, 7] <- ifelse(1 / (2 * bnf1$info$n) < BenTest1[, 6], (BenTest1[, 5] - 1 / (2 * bnf1$info$n)), 0)
BenTest1[,7]<-abs(BenTest1[,7]/sqrt(BenTest1[,4]*(1-BenTest1[,4])/bnf1$info$n))
sort(BenTest1[,7],decreasing=TRUE)

colnames(BenTest1)<-c("First.Digit","Count","Actual","Benford","Difference","AbsDiff","Z-stat")





###--- Show Benford First Two Digits Summary
bnf12

###--- Create Benfod vs Data table FOR THE FIRST TWO DIGITS
BenTest12<-as.data.frame(bnf12$bfd$digits)
BenTest12[,2]<-as.data.frame(bnf12$bfd$data.dist.freq)
BenTest12[,3]<-as.data.frame(bnf12$bfd$data.dist)
BenTest12[,4]<-as.data.frame(bnf12$bfd$benford.dist)
BenTest12[,5]<-BenTest12[,3]-BenTest12[,4]
BenTest12[,6]<-abs(BenTest12[,5])

###--- Two Sample Z-Test
#To test the hypothesis that the mean of normal sample (status=0) 
#and the actual sample (status=1) are equal (d0=0 ) 
BenTest12[,7]<-ifelse(1/(2*bnf12$info$n)<BenTest12[,6] , (BenTest12[,5]-1/(2*bnf12$info$n)) , 0)
BenTest12[,7]<-abs(BenTest12[,7]/sqrt(BenTest12[,4]*(1-BenTest12[,4])/bnf12$info$n))
sort(BenTest12[,7],decreasing=TRUE)

colnames(BenTest12)<-c("FT.Digits","Count","Actual","Benford","Difference","AbsDiff","Z-stat")

###--- Show top 5 Z-stats
top11<-suspectsTable(bnf12)[1,1]
top12<-suspectsTable(bnf12)[1,2]
top13<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[1,1])-9,7])
top5<- data.frame(top11,top12,top13)
top5[2,1]<-suspectsTable(bnf12)[2,1]
top5[2,2]<-suspectsTable(bnf12)[2,2]
top5[2,3]<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[2,1])-9,7])
top5[3,1]<-suspectsTable(bnf12)[3,1]
top5[3,2]<-suspectsTable(bnf12)[3,2]
top5[3,3]<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[3,1])-9,7])
top5[4,1]<-suspectsTable(bnf12)[4,1]
top5[4,2]<-suspectsTable(bnf12)[4,2]
top5[4,3]<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[4,1])-9,7])
top5[5,1]<-suspectsTable(bnf12)[5,1]
top5[5,2]<-suspectsTable(bnf12)[5,2]
top5[5,3]<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[5,1])-9,7])
top5[6,1]<-suspectsTable(bnf12)[6,1]
top5[6,2]<-suspectsTable(bnf12)[6,2]
top5[6,3]<-as.character(BenTest12[as.numeric(suspectsTable(bnf12)[6,1])-9,7])
colnames(top5)<-c("FTD","Abs.Diff","Z-stat")
rm(top11,top12,top13)
View(top5)
top5

## Write the results to a csv file
###############################################
cat('Our conclusion FOR TWO FIRST DIGITS is:',bnf12$MAD.conformity,"\n")
write.csv(BenTest12, file=outFile)    



###--- Create plots with ALL Benford stats
plot(bnf1, except = "none")
plot(bnf12, except = "none")


###--- Create plots with SPECIFIC Benford stats
#plot(x, except = c("digits", "second order", "summation", "mantissa", "chi square", "abs diff", "ex summation"))

#FIRST DIGIT PLOT
plot(bnf1, except = c("second order", "summation", "mantissa", "chi square", "abs diff", "ex summation",
                       multiple = FALSE),xlim=c(0,200))
#FIRST TWO DIGITS PLOT
plot(bnf12, except = c("second order", "summation", "mantissa", "chi square", "abs diff", "ex summation",
                   multiple = FALSE))



#Export charts
###############################################
plots.dir.path <- list.files(tempdir(), pattern="rs-graphics", full.names = TRUE); 
plots.png.paths <- list.files(plots.dir.path, pattern=".png", full.names = TRUE)
file.copy(from=plots.png.paths, to="/Users/fred/Desktop/RPNP/Charts/")
#file.copy(from=plots.png.paths, to="C:/Users/Frederico/Desktop/BH/Charts/")

plots.png.details <- file.info(plots.png.paths)
plots.png.details <- plots.png.details[order(plots.png.details$ctime),]
valid.png.details <- plots.png.details %>% filter(size>0)

sorted.png.names <- gsub(plots.dir.path, "/Users/fred/Desktop/RPNP/Charts/", row.names(valid.png.details), fixed=TRUE)
#sorted.png.names <- gsub(plots.dir.path, "C:/Users/Frederico/Desktop/BH/Charts/", row.names(valid.png.details), fixed=TRUE)

numbered.png.names <- paste0("/Users/fred/Desktop/RPNP/Charts/", "RPNP.", 1:length(sorted.png.names), ".png")
#numbered.png.names <- paste0("C:/Users/Frederico/Desktop/BH/Charts/","3.", 1:length(sorted.png.names), ".png")

# Rename all the .png files as: 1.png, 2.png, 3.png, and so on.
file.rename(from=sorted.png.names, to=numbered.png.names)

# Remove empty.png that was auto-generated
file.remove('/Users/fred/Desktop/RPNP/Charts/empty.png')




